'''


exec(''.join(open("/home/dalton_m/ppp/pyfiles/ANppp_ldbv1.py", encoding="utf8").readlines()[:]))
nohup python3 /home/dalton_m/ppp/pyfiles/ANppp_ldbv1.py  | tee &
'''

# Pipeline switch: when set to 1 the `if Dstep1 == 1:` section below rebuilds
# the step-1 extract and writes `<filename>_step1.csv`; with any other value
# that section is skipped and the previously written CSV is read back as-is.
Dstep1 = 0

import os

import pandas as pd

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import sys

sys.path.append('/home/dalton_m/payload')
from basicfunctions import *

# ---- run configuration: script name, dated results folder, logging ----
# Explicit import: the script previously relied on `from basicfunctions import *`
# to put `logging` in scope.
import logging

filename = 'ANppp_ldbv1'

resultsloc1 = "/dataERS/eract/daltonm/results/ppp/"

from datetime import date
# One results folder per run date (YYYYMMDD).
datestr = date.today().strftime(format="%Y%m%d")
resultsloc = resultsloc1 + datestr + '/'
# exist_ok=True replaces the exists()/makedirs() pair, which could fail if the
# folder appeared between the check and the create.
os.makedirs(resultsloc, exist_ok=True)


# Only ERROR and above are written to the log file, so the debug/info/warning
# smoke-test messages below are filtered out by this configuration.
logging.basicConfig(filename=resultsloc + filename + '.txt', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')
# logging.exception() is meant to be called from an `except` block (it appends
# the active traceback). There is no active exception here, so log the same
# message at ERROR level without the empty traceback.
logging.error('And this, too')
# Route warnings.warn(...) output through the logging system as well.
logging.captureWarnings(True)

sys.path.append(resultsloc1 + 'pyfiles/')

# q: latest quarter for which data is available. Quarters are numbered
# consecutively in the step-1 section below: the 2020 QCEW pull supplies
# wages_q1..wages_q4 and the four 2021 LDB extract files are renamed to
# quarters q-3 .. q (5..8 when q = 8).
q=8
# Exclusive upper bound for month loops: range(1, moncount) covers months 1..q*3.
moncount = q*3 + 1
# Name of the 2019 wage column that is scaled by the 2019 CPI factor in step 1.
kwvar = 'monthly_wage_avg_19'


if Dstep1 == 1:
    # ---- 2021 LDB extracts: one headerless, tab-separated file per quarter ----
    # Column layout applied to every raw extract file.
    cols = ['ldb_num', 'fips', 'ui_acct', 'run_num', 'ein', 'cnty', 'own', 'naics', 'meei',
            'emp_1', 'emp_2', 'emp_3', 'wages', 'latitude', 'longitude', 'legal_name', 'trade_name',
            'telephone', 'rptyrqtr', 'emp_1_flag', 'emp_2_flag', 'emp_3_flag', 'wages_flag', 'setupdate',
            'end_liab_date', 'zip5', 'tract', 'del']
    # Establishment attributes wanted from every quarter (coalesced across quarters below).
    kcols1a = ['ldb_num', 'fips', 'ui_acct', 'run_num', 'ein', 'cnty', 'own', 'naics',
               'setupdate',
               'end_liab_date', 'zip5', ]
    # Quarter-specific employment / wage measures; renamed per quarter so they never collide.
    kcols1b = [
        'emp_1', 'emp_2', 'emp_3', 'wages', 'emp_1_flag', 'emp_2_flag', 'emp_3_flag', 'wages_flag', ]
    mc = ['ldb_num']
    # Attribute columns other than the merge key. The original built the column
    # selection as mc + ... + kcols1a, which listed 'ldb_num' twice.
    attrcols = [c for c in kcols1a if c not in mc]
    # Files are listed newest quarter first: file i holds quarter q - i.
    for i, f in enumerate(['ldb34115_q421.dat', 'ldb34116_q321.dat', 'ldb34117_q221.dat', 'ldb34118_q121.dat', ]):
        # Map the within-quarter names (emp_1..emp_3, wages) onto running month /
        # quarter numbers, e.g. emp_1 of quarter 8 -> emp_m22, wages -> wages_q8.
        renamedict = {
            'emp_' + str(j): 'emp_m' + str((q - i - 1) * 3 + j) for j in range(1, 4)
        }
        renamedict.update({'emp_' + str(j) + '_flag': 'emp_m' + str((q - i - 1) * 3 + j) + 'flag' for j in range(1, 4)})
        renamedict.update({'wages': 'wages_q' + str(q - i),
                           'wages_flag': 'wages_q' + str(q - i) + '_flag'})

        dft = cudf.read_csv(dataloc + 'ldb/' + f, sep='\t', header=None)
        dft.columns = cols
        # Drop records with meei == 2 (NOTE(review): presumably the multi-unit
        # master record -- confirm against the LDB layout).
        dft = dft[dft['meei'] != 2]

        if i == 0:
            # First (latest) quarter seeds the wide establishment table.
            df = dft[kcols1a + kcols1b].rename(columns=renamedict)
        else:
            kcols = mc + list(renamedict.values()) + attrcols
            dft = dft.rename(columns=renamedict)[kcols]
            df = df.merge(dft, on=mc, how='outer')
            # Attribute columns exist on both sides: keep the value already in df
            # (later quarter) and fall back to this quarter's value when missing.
            tcols = [c[:-2] for c in df if c.endswith('_x')]
            for c in tcols:
                df[c] = df[c + '_x'].fillna(df[c + '_y'])
            dcols = [c + '_x' for c in tcols] + [c + '_y' for c in tcols]
            df = df.drop(columns=dcols)
        # release the per-quarter frame before reading the next file
        dft = None

    # zip5 may contain non-numeric text; coerce those entries to missing.
    df['zip5'] = pd.to_numeric(df['zip5'].to_pandas(), errors='coerce')
    #df['tract'] = pd.to_numeric(df['tract'].to_pandas(), errors='coerce')




    # -------------------------------------------------------------
    # 2020 info: pull the 2020 QCEW records and merge them onto the
    # 2021 establishment table built above.
    # -------------------------------------------------------------
    yr = 2020
    qtr = ''
    own = ''
    fips = ''
    cols = ['ein', 'wages', 'aaemp', 'meei', 'naics_code', 'ui_acct', 'rpt_unit']

    dfldb = qcew(yr, qtr, fips, own, cols)
    dfldb = dfldb.rename(columns={'ui_acct': 'ui_acct',
                                  'rpt_unit': 'run_num',
                                  'naics_code': 'naics'})
    # keep only the listed meei codes, then discard the code itself
    meei_ok = dfldb['meei'].isin([1, 3, 4, 5, 6])
    dfldb = dfldb[meei_ok].drop(columns='meei')

    # Columns to carry over: for each 2020 quarter the three monthly employment
    # counts, their flags, and the quarter's wages with its flag; then the
    # establishment identifiers.
    kcols = ['ldb_num']
    for k in range(1, 5):
        qmonths = range((k - 1) * 3 + 1, k * 3 + 1)
        kcols += ['emp_m' + str(m) for m in qmonths]
        kcols += ['emp_m' + str(m) + 'flag' for m in qmonths]
        kcols += ['wages_' + str(k), 'wages_flag' + str(k)]
    kcols += ['own', 'naics', 'ui_acct', 'run_num', 'ein']

    # Align wage column names with the 2021 naming (wages_q<k>, wages_q<k>_flag).
    renamedict = {}
    for k in range(1, 5):
        renamedict['wages_' + str(k)] = 'wages_q' + str(k)
    for k in range(1, 5):
        renamedict['wages_flag' + str(k)] = 'wages_q' + str(k) + '_flag'

    mc = ['ldb_num']
    dfldb = dfldb[kcols].rename(columns=renamedict).drop_duplicates(mc)

    df = df.merge(dfldb, on=mc, how='outer', indicator=False)

    # Columns present on both sides come back as <name>_x / <name>_y: keep the
    # 2021 value (as float) and fill gaps from the 2020 value.
    overlap = [name[:-2] for name in df if name.endswith('_x')]
    for name in overlap:
        from_2021 = df[name + '_x'].astype('float')
        from_2020 = df[name + '_y'].astype('float')
        df[name] = from_2021.fillna(from_2020)

    df.drop(columns=[name + sfx for sfx in ('_x', '_y') for name in overlap], inplace=True)

    # -------------------------------------------------------------
    # deal with flags: a value whose flag equals 'M' is set to missing,
    # after which the flag columns are dropped.
    # (np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.)
    # -------------------------------------------------------------
    for mon in range(1, moncount):
        empcol = 'emp_m' + str(mon)
        cond = (df[empcol + 'flag'] == 'M').to_array()
        df[empcol] = np.where(cond, np.nan, df[empcol].to_pandas())

    for mon in range(1, q + 1):
        wagecol = 'wages_q' + str(mon)
        cond = (df[wagecol + '_flag'] == 'M').to_array()
        df[wagecol] = np.where(cond, np.nan, df[wagecol].to_pandas())

    dropcols = (['emp_m' + str(mon) + 'flag' for mon in range(1, moncount)]
                + ['wages_q' + str(mon) + '_flag' for mon in range(1, q + 1)])
    df = df.drop(columns=dropcols)



    # -------------------------------------------------------------
    # 2019 info: load the prior-year QCEW records and derive annual
    # employment / wage summaries per establishment.
    # -------------------------------------------------------------
    yrm1 = 2019
    fips = ''
    qtr = ''
    own = ''
    cols = ['ein', 'aaemp', 'meei', 'wages', 'naics_code']
    dfldb = qcew(yrm1, qtr, fips, own, cols)
    dfldb = dfldb.rename(columns={'ui_acct': 'ui_acct',
                                  'rpt_unit': 'run_num',
                                  'naics_code': 'naics_ldb'})
    meei_ok = dfldb['meei'].isin([1, 3, 4, 5, 6])
    dfldb = dfldb[meei_ok].drop(columns='meei')

    # Drop establishments whose employment is missing or zero in every month
    # from July through December (months 7..12).
    no_h2_emp = None
    for m in range(7, 13):
        empcol = dfldb['emp_m' + str(m)]
        empty_month = (empcol.isnull()) | (empcol == 0)
        no_h2_emp = empty_month if no_h2_emp is None else (no_h2_emp & empty_month)
    dfldb = dfldb[~no_h2_emp]

    # Annual totals computed on the host (pandas) side.
    emp_by_month = dfldb[['emp_m' + str(i) for i in range(1, 13)]].to_pandas()
    dfldb['aaemp'] = emp_by_month.sum(axis=1)
    dfldb['emp_max'] = emp_by_month.max(axis=1)
    dfldb['tot_wages'] = dfldb[['wages_' + str(i) for i in range(1, 5)]].to_pandas().sum(axis=1)

    # emp_count = number of months with strictly positive employment
    dfldb['emp_count'] = 0
    for mon in range(1, 13):
        has_emp = (dfldb['emp_m' + str(mon)] > 0).to_array()
        dfldb['emp_count'] = dfldb['emp_count'].to_pandas() + 1 * has_emp

    # Averages are taken over the months with positive employment, not over 12.
    dfldb['aaemp'] = dfldb['aaemp'] / dfldb['emp_count']
    dfldb['monthly_wage_avg'] = dfldb['tot_wages'] / dfldb['emp_count']

    # dfldb['aaemp'] = dfldb['aaemp'] / 12
    # dfldb['monthly_wage_avg'] = dfldb['tot_wages'] / 12

    ###seasonal info
    # dfldb['janmar_emp_ratio'] = ((dfldb['emp_m1'].to_pandas().fillna(0) + dfldb['emp_m2'].to_pandas().fillna(0) + dfldb['emp_m3'].to_pandas().fillna(0)) / 3) / (dfldb['aaemp'].to_pandas())
    # dfldb['aprjun_emp_ratio'] = ((dfldb['emp_m4'].to_pandas().fillna(0) + dfldb['emp_m5'].to_pandas().fillna(0) + dfldb['emp_m6'].to_pandas().fillna(0)) / 3) / (dfldb['aaemp'].to_pandas())
    # dfldb['julsep_emp_ratio'] = ((dfldb['emp_m7'].to_pandas().fillna(0) + dfldb['emp_m8'].to_pandas().fillna(0) + dfldb['emp_m9'].to_pandas().fillna(0)) / 3) / (dfldb['aaemp'].to_pandas())
    # dfldb['octdec_emp_ratio'] = ((dfldb['emp_m10'].to_pandas().fillna(0) + dfldb['emp_m11'].to_pandas().fillna(0) + dfldb['emp_m12'].to_pandas().fillna(0)) / 3) / (dfldb['aaemp'].to_pandas())
    # ---- 2019 closure indicators, seasonal wage flags, and merge onto df ----
    # month number -> three-letter label used in the <mon>_closed column names
    mondict = dict(enumerate(['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                              'jul', 'aug', 'sep', 'oct', 'nov', 'dec'], start=1))
    # <mon>_closed is 1 when that month's 2019 employment is zero or missing.
    for i, mon in mondict.items():
        dfldb[mon + '_closed'] = 1 * (dfldb['emp_m' + str(i)].to_pandas().fillna(0) == 0)

    closedmons = [mon + '_closed' for mon in mondict.values()]

    # Quarterly wages relative to three months of the average monthly wage
    # (three months because the numerator is a quarterly figure).
    # NOTE: these ratio columns are not in `kcols` below, so they are dropped
    # before the merge.
    ratiocols = []
    for k, label in enumerate(['janmar', 'aprjun', 'julsep', 'octdec'], start=1):
        rcol = label + '_wages_ratio'
        dfldb[rcol] = (dfldb['wages_' + str(k)].to_pandas().fillna(0)) / (3 * dfldb['monthly_wage_avg'].to_pandas())
        ratiocols.append(rcol)

    # mean employment over months 2..6 (Feb-Jun 2019)
    dfldb['emp_feb_june19'] = dfldb[['emp_m' + str(i) for i in range(2, 7)]].to_pandas().mean(axis=1)

    # Turn each ratio into a 0/1 flag for "above 1"; infinite or missing ratios
    # are treated as exactly 1 and therefore flagged 0.
    # (np.nan replaces the np.NaN alias, which NumPy 2.0 removed.)
    for c in ratiocols:
        dfldb[c] = dfldb[c].to_pandas().replace(np.inf, np.nan).fillna(1)
        cond = (dfldb[c] > 1).to_array()
        dfldb[c] = np.where(cond, 1, 0)

    kcols = ['ldb_num', 'emp_feb_june19',
             'aaemp', 'monthly_wage_avg', 'own', 'naics_ldb', 'tot_wages', 'ein', 'fips', 'emp_max'
             ] + closedmons
    # Suffix the 2019 measures so they do not collide with the 2020/2021 columns.
    renamedict = {
        'aaemp': 'aaemp_19',
        'monthly_wage_avg': 'monthly_wage_avg_19',
        'own': 'own_19',
        'naics_ldb': 'naics_19',
        'tot_wages': 'tot_wages_19',
        'ein': 'ein_19',
        'fips': 'fips_19',
        'emp_max': 'emp_max_19'
    }
    dfldb = dfldb[kcols].rename(columns=renamedict)

    mc = ['ldb_num']
    # one 2019 row per establishment before the outer merge
    df = df.merge(dfldb.drop_duplicates('ldb_num'), on=mc, how='outer', indicator=False)



    #
    #
    # yr = 2019
    # qtr = ''
    # own = ''
    # fips = ''
    # #'ui_acct', 'rpt_unit',
    # cols = ['ein', 'wages', 'aaemp','meei', 'naics_code' ]
    #
    # dfldb = qcew(yr,qtr,fips,own,cols).rename(columns = {'ui_acct' : 'ui_acct',
    #                                                      'rpt_unit' : 'run_num',
    #                                                      'naics_code' : 'naics_ldb'})
    # dfldb = dfldb[dfldb['meei'].isin([1,3,4,5,6])].drop(columns = 'meei')
    # dfldb = dfldb[~(((dfldb['emp_m7'].isnull()) | (dfldb['emp_m7'] == 0)) & (
    #             (dfldb['emp_m8'].isnull()) | (dfldb['emp_m8'] == 0)) & (
    #                             (dfldb['emp_m9'].isnull()) | (dfldb['emp_m9'] == 0)) & (
    #                             (dfldb['emp_m10'].isnull()) | (dfldb['emp_m10'] == 0)) & (
    #                             (dfldb['emp_m11'].isnull()) | (dfldb['emp_m11'] == 0)) & (
    #                             (dfldb['emp_m12'].isnull()) | (dfldb['emp_m12'] == 0)))]
    # #'ui_acct', 'run_num',
    # #, 'ein'
    # kcols = ['aaemp', 'tot_wages', 'emp_m12', 'fips', 'ldb_num', 'naics_ldb', 'ein']
    # dfldb['tot_wages'] = dfldb[['wages_'+str(i) for i in range(1,5)]].to_pandas().sum(axis=1)
    # dfldb['aaemp'] = dfldb[['emp_m'+str(i) for i in range(1,13)]].to_pandas().mean(axis=1)
    # dfldb = dfldb[(dfldb['aaemp'] > 0)]
    # #dfldbein = None
    # # dfldb['ein_aaemp'] = dfldb[['ein', 'aaemp']].to_pandas().groupby('ein')['aaemp'].transform('sum')
    # # dfldb['ein_emp_m12'] = dfldb[['ein', 'aaemp']].to_pandas().groupby('ein')['aaemp'].transform('sum')
    # renamedict4 = {
    #     'aaemp' : 'aaemp_19',
    #     'emp_m12': 'emp_m12_19',
    #     # 'ein_emp_m12': 'ein_emp_m12_19',
    #     # 'ein_aaemp': 'ein_aaemp_19',
    #     'tot_wages': 'tot_wages_19',
    #     'fips' : 'fips_19',
    #     'ein' : 'ein_19',
    # }
    # dfldb = dfldb[kcols].rename(columns = renamedict4)
    #
    # mc = ['ldb_num']
    # #dflist.append(sqlmerge(dft, dfldb, mc))
    # df = df.merge(dfldb.drop_duplicates('ldb_num'), on=mc, how='outer', indicator=False)
    #
    # '''
    # #############################
    # get 2019 EIN info
    # #############################
    # '''
    # ---- 2019 EIN-level totals: employment summed, establishments counted ----
    mc = ['ein_19']
    aggstats = {
        'aaemp_19': 'sum',
        'ldb_num': 'count'
    }
    # De-duplicate on ldb_num first, exactly as the establishment-level merge
    # does, so a repeated establishment record cannot inflate the EIN
    # employment total or the establishment count.
    # The previous trailing .reset_index() is dropped: with as_index=False the
    # keys are already columns, and reset_index() only added a stray 'index'
    # column to df.
    einagg = (dfldb[['ein_19', 'aaemp_19', 'ldb_num']]
              .drop_duplicates('ldb_num')
              .groupby(mc, as_index=False)
              .agg(aggstats)
              .rename(columns={'aaemp_19': 'ein_aaemp_19', 'ldb_num': 'num_estab_ein'}))
    df = df.merge(einagg, on=mc, how='left', indicator=False)

    # release the 2019 frames
    einagg = None
    dfldb = None



    '''
    previous results showed ~10% of months had closures
    - current results are showing ~1.5% - something is wrong
    
    **** i think it is solved
    *** adding in filler for 0 employment reports so that i get complete set
    '''
    # Get the last month (from month 4 onward) with a reported employment value
    # and the employment reported in that month. Later months overwrite earlier
    # ones, so after the loop these hold the latest report.
    # (np.nan replaces the np.NaN alias, which NumPy 2.0 removed.)
    df['lastmonth'] = np.nan
    df['lastmonthnum'] = np.nan
    for i in range(4, q * 3 + 1):
        empcol = 'emp_m' + str(i)
        cond = (df[empcol] >= 0).to_array()
        df['lastmonth'] = np.where(cond, df[empcol].to_pandas(), df['lastmonth'].to_pandas())
        df['lastmonthnum'] = np.where(cond, i, df['lastmonthnum'].to_pandas())

    # If a month is missing, lies after the last reported month, and that last
    # report was zero employment, fill the month with 0. This must be a second
    # pass because it needs the final lastmonth / lastmonthnum values.
    for i in range(4, q * 3 + 1):
        empcol = 'emp_m' + str(i)
        cond = ((df[empcol].isnull()) & (df['lastmonthnum'] < i) & (df['lastmonth'] == 0)).to_array()
        df[empcol] = np.where(cond, 0, df[empcol].to_pandas())

    #### get monthly wage numbers

    # Quarterly employment totals q<k>_emp = sum of the quarter's three months.
    # Derived from `q` instead of eight hand-written lines, so the set of
    # quarters stays in step with the wage loop below.
    for tq in range(1, q + 1):
        qmonths = ['emp_m' + str(i) for i in range((tq - 1) * 3 + 1, tq * 3 + 1)]
        df['q' + str(tq) + '_emp'] = df[qmonths].to_pandas().sum(axis=1)

    # Allocate each quarter's wages to its months in proportion to that
    # month's share of the quarter's employment.
    for tq in range(1, q + 1):
        qwage = 'wages_q' + str(tq)
        qemp = 'q' + str(tq) + '_emp'
        for mon in range((tq - 1) * 3 + 1, tq * 3 + 1):
            wcol = 'wage_m' + str(mon)
            df[wcol] = df[qwage].to_pandas() * df['emp_m' + str(mon)].to_pandas() / df[qemp].to_pandas()
            # zero quarterly employment and zero quarterly wages: the share is
            # undefined, so record the monthly wage as 0 rather than missing
            cond = ((df[qemp] == 0) & (df[wcol].isnull()) & (df[qwage] == 0)).to_array()
            df[wcol] = np.where(cond, 0, df[wcol].to_pandas())



    # PPP rules are about maintaining employment from Feb-June 2019 OR
    # Jan-Feb 2020; this is the Jan-Feb 2020 average (months 1 and 2).
    df['emp_jan_feb20'] = df[['emp_m1', 'emp_m2']].to_pandas().mean(axis=1)
    # total of the twelve 2020 monthly employment counts
    df['emp_20tot'] = df[['emp_m' + str(i) for i in range(1, 13)]].to_pandas().sum(axis=1)

    # The next part is only for PPP: write monthly employment out as a long
    # panel (one row per ldb_num x num_month), then drop the wide monthly
    # columns to trim the main table.
    empcols = ['emp_m' + str(mon) for mon in range(1, moncount)]
    emp_wide = df[['ldb_num', ] + empcols].to_pandas().drop_duplicates('ldb_num')
    emp_long = pd.wide_to_long(emp_wide, stubnames='emp_m', i=['ldb_num', ], j='num_month').reset_index()
    emp_long.to_csv(dataloc + 'pppfiles/' + filename + '_emp.csv')
    emp_wide = None
    emp_long = None

    kcols = ['emp_m' + str(i) for i in range(1, moncount)]
    df = df.drop(columns=kcols)



    # Annual wage figure with a fallback: start from 2019 total wages and, where
    # that is missing, use 4 x the average of the positive quarterly wages
    # observed in quarters 1..q.
    df['tot_qtr_wages_fill'] = df['tot_wages_19'].copy()
    df['wage_count'] = 0
    df['wage_sum'] = 0
    for mon in range(1, q + 1):
        wq = 'wages_q' + str(mon)
        # missing quarterly wages count as 0 from here on
        df[wq] = df[wq].to_pandas().fillna(0)
        positive = (df[wq] > 0).to_array()
        df['wage_count'] = df['wage_count'].to_pandas() + 1 * positive
        df['wage_sum'] = df['wage_sum'].to_pandas() + df[wq].to_pandas().fillna(0)

    # average wage over the quarters that had positive wages
    df['wage_filler'] = df['wage_sum'] / df['wage_count']
    cond = (df['tot_qtr_wages_fill'].isnull()).to_array()
    annualized = 4 * df['wage_filler'].to_pandas()
    df['tot_qtr_wages_fill'] = np.where(cond, annualized, df['tot_qtr_wages_fill'].to_pandas())

    # Long panel of the monthly wage estimates (one row per ldb_num x num_month).
    wage_wide = df[['ldb_num'] + ['wage_m' + str(mon) for mon in range(1, moncount)]].to_pandas().drop_duplicates('ldb_num')
    dft = pd.wide_to_long(wage_wide, stubnames='wage_m', i=['ldb_num'], j='num_month').reset_index()
    wage_wide = None


    ######################### make panel
    ############### cpi adjustment
    # Quarterly CPI factors: year -> {quarter -> multiplier}. Uses the middle
    # month of each quarter in the CPI calc, compared to August 2021.
    # 2015 is included here as well; it was previously defined but left out of
    # this lookup.
    cpiq = {
        2021: {1: 1.04, 2: 1.02, 3: 1, 4: 1, },
        2020: {1: 1.06, 2: 1.07, 3: 1.05, 4: 1.05, },
        2019: {1: 1.08, 2: 1.07, 3: 1.07, 4: 1.06, },
        2018: {1: 1.10, 2: 1.09, 3: 1.08, 4: 1.09, },
        2017: {1: 1.12, 2: 1.12, 3: 1.11, 4: 1.11, },
        2016: {1: 1.15, 2: 1.14, 3: 1.14, 4: 1.13, },
        2015: {1: 1.17, 2: 1.15, 3: 1.15, 4: 1.15, },
    }

    # Annual CPI factors: uses July of the year, compared to August 2021.
    cpiy = {
        2021: 1,
        2020: 1.06,
        2019: 1.07,
        2018: 1.09,
        2017: 1.12,
        2016: 1.14,
        2015: 1.15,
    }
    # the 2019 average monthly wage is deflated with the annual 2019 factor
    df[kwvar] = df[kwvar] * cpiy[2019]

    # Running month number -> CPI factor of the quarter it falls in. Month 1 is
    # January 2020, so months 1-12 use 2020 and months 13-24 use 2021. Derived
    # from cpiq rather than typed out month by month.
    tdict = {
        mon: cpiq[2020 + (mon - 1) // 12][((mon - 1) % 12) // 3 + 1]
        for mon in range(1, moncount)
    }

    dft['wage_m'] = dft['wage_m'] * dft['num_month'].map(tdict)

    dft.to_csv(dataloc + 'pppfiles/' + filename + '_wage.csv')

    # The wide wage / quarterly-employment columns are no longer needed on df.
    kcols = (['wages_q' + str(i) for i in range(1, q + 1)]
             + ['wage_m' + str(i) for i in range(1, moncount)]
             + ['q' + str(i) + '_emp' for i in range(1, q + 1)])
    df = df.drop(columns=kcols)

    # -------------------------------------------------------------
    # closures / openings: split setupdate and end_liab_date into year and
    # month parts and derive the running month in which a new establishment
    # was set up.
    # (np.nan replaces the np.NaN alias, which NumPy 2.0 removed.)
    # -------------------------------------------------------------
    datecols = [('setup', 'setupdate'), ('closed', 'end_liab_date')]

    # year = first four characters of the date value
    for prefix, src in datecols:
        df[prefix + '_yr'] = pd.to_numeric(df[src].to_pandas().apply(lambda x: str(x)[:4]), errors='coerce')

    # years outside 1900..2022 are treated as missing
    for c in ['setup_yr', 'closed_yr']:
        cond = ((df[c] < 1900) | (df[c] > 2022)).to_pandas()
        df[c] = np.where(cond, np.nan, df[c].to_pandas())

    # month = characters 5-6 of the date value
    for prefix, src in datecols:
        df[prefix + '_month'] = pd.to_numeric(df[src].to_pandas().apply(lambda x: str(x)[4:6]), errors='coerce')

    # months outside 1..12 are treated as missing
    for c in ['setup_month', 'closed_month']:
        cond = ((df[c] < 1) | (df[c] > 12)).to_pandas()
        df[c] = np.where(cond, np.nan, df[c].to_pandas())


    ###if employment in last month, then not closed

    ###if it's a possible closure date BUT still has employment, then replace closures
    # cond = (((df['closed_yr'].to_pandas().astype('float') < 2021) | ((df['closed_yr'].to_pandas().astype('float') == 2021) & (df['closed_month'].to_pandas().astype('float') < 9))) & (df['emp_m'+str(moncount-1)].to_pandas() > 0))
    # df['Dfake_closing'] = np.where(cond, 1, 0)


    # unknown setup year is recorded as 1900
    df['setup_yr'] = df['setup_yr'].fillna(1900)

    # new_month: running month of setup (1-12 for 2020, 13-24 for 2021),
    # missing for establishments set up in any other year.
    cond = (df['setup_yr'] == 2020).to_array()
    df['new_month'] = np.where(cond, df['setup_month'].to_pandas(), np.nan)
    cond = (df['setup_yr'] == 2021).to_array()
    df['new_month'] = np.where(cond, 12 + df['setup_month'].to_pandas(), df['new_month'].to_pandas())

    # -------------------------------------------------------------
    # bring in better location info
    #
    # this pulls from the static address files on kepler and merges them in.
    # The 2021 file is applied first, then 2020, then 2019 (to get as many
    # matches as possible); for columns present on both sides the value
    # already in df wins and the file only fills gaps.
    # -------------------------------------------------------------
    for geoyr in ['21', '20', '19']:
        dfgeo = cudf.read_csv(dataloc + 'ldb/newgeog' + geoyr + '.psv', sep='|')
        df = df.merge(dfgeo, on=['ldb_num'], how='left')
        dfgeo = None

        # coalesce overlapping columns: existing value first, file value second
        tcols2 = [c[:-2] for c in df if c.endswith('_x')]
        for c in tcols2:
            df[c] = df[c + '_x'].to_pandas().fillna(df[c + '_y'].to_pandas())

        # drop the file's bookkeeping columns together with the merge suffixes
        dcols = ['Unnamed: 0', 'msa_code',
                 'May 2020 MSA name', 'tract_orig', ] + [c + '_x' for c in tcols2] + [c + '_y' for c in tcols2]
        df.drop(columns=dcols, inplace=True)

    # fix county: some values may be the 5-digit state+county code instead of
    # the 3-digit county code. Keep only the last three digits for values above
    # 1000, and store everything before the last three digits in st_cnty.
    cond = (df['cnty'] > 1000).to_array()
    # integer part of each value as text (anything after a '.' is cut off)
    cnty_text = df['cnty'].to_pandas().apply(lambda x: str(x).split('.')[0])
    df['st_cnty'] = cnty_text.apply(lambda s: s[:-3])
    last_three = cnty_text.apply(lambda s: float(s[-3:]))
    df['cnty'] = np.where(cond, last_three, df['cnty'].to_pandas())

    # -------------------------------------------------------------
    # save it
    # -------------------------------------------------------------
    # state fips: fall back to the 2019 value when the current one is missing
    df['fips'] = df['fips'].to_pandas().fillna(df['fips_19'].to_pandas())

    # columns written to the step-1 file, in output order
    id_cols = ['ldb_num', 'fips', 'ui_acct', 'run_num', 'ein', 'cnty', 'own', 'naics',
               'zip5', 'tract', 'emp_feb_june19',
               'aaemp_19', 'monthly_wage_avg_19', 'own_19', 'naics_19']
    closed_cols = [m + '_closed' for m in ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec')]
    tail_cols = ['emp_jan_feb20', 'setup_yr', 'setup_month',
                 'closed_yr', 'closed_month', 'new_month', 'tot_qtr_wages_fill',
                 'ein_aaemp_19', 'st_cnty', 'emp_max_19', 'emp_20tot']
    kcols = id_cols + closed_cols + tail_cols

    df = df[kcols]

    # -------------------------------------------------------------
    # combined state+county code (fipscnty), with a fallback taken from the
    # 2019/2020 address files for establishments where it cannot be computed
    # -------------------------------------------------------------
    tcols = ['ldb_num', 'cnty', 'fipsstate']
    addr_frames = []
    for year in [2019, 2020]:
        addr = pd.read_csv(dataloc + 'ldb/address/ldbaddressesv1_' + str(year) + '.psv', sep='|')
        addr_frames.append(addr[tcols])
    dfg = pd.concat(addr_frames).drop_duplicates()
    dfg['fipscnty'] = dfg['fipsstate'] * 1000 + dfg['cnty']
    # ldb_num -> fipscnty lookup (a later row overrides an earlier one)
    tdict = dict(zip(dfg['ldb_num'], dfg['fipscnty']))

    df['fipscnty'] = df['fips'] * 1000 + df['cnty']
    fallback = df['ldb_num'].to_pandas().map(tdict)
    df['fipscnty'] = df['fipscnty'].to_pandas().fillna(fallback)

    dfg = None
    tdict = None
    df.to_pandas().to_csv(dataloc + 'pppfiles/' + filename + '_step1.csv')

# Load the step-1 extract (written by the Dstep1 section, on this run or an earlier one).
df = cudf.read_csv(dataloc + 'pppfiles/' + filename + '_step1.csv')

# --------------------------------------------------------------
# ein / employment edits
# --- not sure how much of this i need
# --------------------------------------------------------------

# filling in missing ein-level employment with the establishment's own 2019 employment
df['ein_aaemp_19_fill2'] = df['ein_aaemp_19'].to_pandas().fillna(df['aaemp_19'].to_pandas())
df['aaemp_19_fill2'] = df['aaemp_19'].copy()
df['aaemp_19_fill'] = df['aaemp_19'].copy()


#### wage per worker
df['wage_per_worker'] = df['tot_qtr_wages_fill'] / (df['aaemp_19_fill2'])
# division by zero employment yields +inf; treat those as missing
# (np.nan replaces the np.NaN alias, which NumPy 2.0 removed)
df['wage_per_worker'] = df['wage_per_worker'].replace([np.inf], np.nan)

# wage_class is the integer position of the band (0..4) stored as float;
# wlabs maps that position back to its label. Bands are closed on the left
# (right=False), e.g. exactly 20e3 falls in the second band.
wlabs = dict(enumerate(['<20k', '>20k,<40k', '>40k,<60k', '>60k,<80k', '>80k', ]))
bins = [-1, 20e3, 40e3, 60e3, 80e3, 9e9]
df['wage_class'] = pd.cut(df['wage_per_worker'].to_pandas(), bins=bins, labels=wlabs.keys(), right=False).astype('float')


####edits for new
'''
this is rough approximate - if an estab just nonresponds 2019 but shows up in 2020, they are not a "birth", though this strategy would treat them as a birth
'''
# sumcols1 = ['aaemp_19',
#             #'emp_m12_19',
#             'emp_m2',
#             #'emp_m3'
#             ]
# for c in sumcols1:
#     cond = (df[c].isnull()).to_array()
#     df['D'+c+'_fill'] = np.where(cond, 0, 1)
#     df[c+'_fill'] = df[c].fillna(0)
#
# sumcols1 = [i+'_fill' for i in sumcols1]

# run_num: coerce to numeric; anything unparseable or missing becomes 0
run_num_numeric = pd.to_numeric(df['run_num'].to_pandas(), errors='coerce')
df['run_num'] = run_num_numeric.fillna(0)

#kwvar = 'monthly_wage_avg_19'
# & (df[kwvar] > 0)
# cond = (df['aaemp_19'] > 0)  & ~(df['naics2'].isin(['92', '99']))
# df = df.loc[cond]


'''
######################################################
ppp data
##################################################################################################
'''


#### size class (based on EIN-level 2019 employment, filled with own employment)
ein_emp = df['ein_aaemp_19_fill2'].to_pandas()

# detailed classes: left-closed bands, labelled with the keys of the 'size' category map
slabs = categoricalvariables('size')
bins = [-1, 10, 50, 250, 500, 9e9]
size_cut = pd.cut(ein_emp, bins=bins, labels=slabs.keys(), right=False)
df['size_class'] = size_cut.astype('float')

# two-way split at 500; right-closed, so exactly 500 falls in the lower class (0)
s2labs = ['0', '500']
bins = [-1, 500, 9e9]
size2_cut = pd.cut(ein_emp, bins=bins, labels=s2labs, right=True)
df['size_class_2'] = size2_cut.astype('float')



#### multi size class
# Dmulti = 1 when the EIN's employment exceeds the establishment's own
# employment, i.e. the EIN covers more than this establishment.
cond = (df['ein_aaemp_19_fill2'] > df['aaemp_19_fill2']).to_array()
df['Dmulti'] = np.where(cond, 1, 0)

# size_class_multi: 'm<t>' / 's<t>' for multi / single units in the upper size
# class(es), '<500' for the lower class, '' when no class applies.
df['size_class_multi'] = ''
for t in s2labs[1:]:
    t = float(t)
    tag = str(int(t))
    in_class = (df['size_class_2'] == t)
    cond = (in_class & (df['Dmulti'] == 1)).to_array()
    cond1 = (in_class & (df['Dmulti'] == 0)).to_array()
    current = df['size_class_multi'].to_pandas()
    single_or_current = np.where(cond1, 's' + tag, current)
    df['size_class_multi'] = np.where(cond, 'm' + tag, single_or_current)

t = 0
cond = (df['size_class_2'] == t).to_array()
df['size_class_multi'] = np.where(cond, '<500', df['size_class_multi'].to_pandas())


# NAICS: fill missing codes from 2019, take the 2-digit prefix, and translate
# it through naics2c() and then the inverted 'naics' category map.
naicsdict = naics2c()
naics_categories = categoricalvariables('naics')
# inverted lookup: category value -> category key
tdict = dict(zip(naics_categories.values(), naics_categories.keys()))

naics_filled = df['naics'].to_pandas().fillna(df['naics_19'].to_pandas())
df['naics'] = naics_filled
df['naics2dig'] = df['naics'].astype('str').str[:2]
sector = df['naics2dig'].map(naicsdict)
df['naics2'] = sector.map(tdict)


#### open up PPP / EIDL loan files and left-merge each onto df
# 2021 PPP columns are renamed so they do not collide with the 2020 ones
rdict = {'LoanAmount' : 'LoanAmount_2021',
         'DateApproved' : 'DateApproved_2021',
         }
# (file, columns to load, merge key, optional rename) in merge order:
# PPP 2020 by establishment, PPP 2020 by EIN, EIDL by establishment,
# PPP 2021 by establishment.
loan_sources = [
    ('pppfiles/pppagg2020', ['LoanAmount', 'DateApproved',  'ldb_num'], ['ldb_num'], None),
    ('pppfiles/pppagg2020', ['LoanAmount_ein', 'DateApproved_ein',  'ein'], ['ein'], None),
    ('pppfiles/combinedeidl', ['D_eidl_grant', 'D_eidl_loan', 'ldb_num', 'eidlloan_date', 'eidlgrant_date'], ['ldb_num'], None),
    ('pppfiles/pppagg2021', ['LoanAmount',  'ldb_num', 'DateApproved'], ['ldb_num'], rdict),
]
for loanfile, loancols, mc, renames in loan_sources:
    # one row per key so the left merge cannot multiply rows of df
    dfp = opennew(loanfile, loancols).drop_duplicates(mc)
    if renames is not None:
        dfp = dfp.rename(columns = renames)
    df = df.merge(dfp, on = mc, how = 'left')
    dfp = None



# industry edit: naics2 is rebuilt as the raw 2-digit prefix, with the
# multi-code sectors collapsed onto their first code
# (31/32/33 -> '31', 44/45 -> '44', 48/49 -> '48').
df['naics2'] = df['naics'].astype('str').str[:2]
cond1 = (df['naics2'].isin(['31','32','33'])).to_array()
cond2 = (df['naics2'].isin(['44', '45'])).to_array()
cond3 = (df['naics2'].isin(['48', '49'])).to_array()
prefix = df['naics2'].to_pandas()
collapsed = np.where(cond3, '48', prefix)
collapsed = np.where(cond2, '44', collapsed)
df['naics2'] = np.where(cond1, '31', collapsed)

# employment columns under the names needed for the cutoff number:
# establishment-level and EIN-level 2019 employment
df['aaemp'] = df['aaemp_19'].copy()
df['ui_aaemp'] = df['ein_aaemp_19'].copy()

## franchise: Dfranchise = 1 when the EIN appears in the SBA franchise file, else 0
franchise_file = cudf.read_csv(dataloc + 'pppfiles/sba_franchise_fullymerged.csv', sep='|')
einlist = franchise_file['ein'].unique().to_pandas().tolist()
franchise_file = None
tdict = dict.fromkeys(einlist, 1)
df['Dfranchise'] = df['ein'].map(tdict).fillna(0)
einlist = None
tdict = None
# Project helpers for the estimation step (provides industrycutoffs).
sys.path.append("/home/dalton_m/ppp")
from rona_estimatefunctions_gpu import *

# industrycutoffs() is expected to add a 'cutoff_score' column, which is read
# below (NOTE(review): confirm against rona_estimatefunctions_gpu).
df = industrycutoffs(df)

# Deligible: 1 when cutoff_score <= 1, 0 when it is above 1, missing when the
# score itself is missing.
# (np.nan replaces the np.NaN alias, which NumPy 2.0 removed.)
conda = (df['cutoff_score'].notnull()).to_pandas()
cond = (df['cutoff_score'] <= 1).to_pandas()
df['Deligible'] = np.where(conda, np.where(cond, 1, 0), np.nan)

#### other LDB info: extra 2019 establishment attributes, left-merged on ldb_num
yr = 19
kcols = ['owner_code', 'hhi', 'urban_classification', 'yr_growth']
mc = ['ldb_num']

# yr holds the two-digit year; ldbdata is called with the four-digit one
dfldb = ldbdata(2000 + yr, kcols, '')
df = df.merge(dfldb, on = mc, how = 'left')
dfldb = None



# ---- PPP receipt indicators and final loan amount / approval date ----
# dummy for ppp receipt at establishment level
cond = (df['LoanAmount'] > 0).to_pandas()
df['Dppp'] = np.where(cond, 1, 0)
# dummy for ppp receipt at EIN level (establishment match OR EIN match)
cond = ((df['LoanAmount'] > 0) | (df['LoanAmount_ein'] > 0)).to_array()
df['Dppp_ein'] = np.where(cond, 1, 0)
# dummy for ppp 2021 receipt
cond = (df['LoanAmount_2021'] > 0)
df['Dppp2021'] = np.where(cond.to_array(), 1, 0)
###

###need to recalculate ein_aaemp. Something isn't right
#df['ein_aaemp_19'] = df[['ein', 'aaemp_19']].to_pandas().groupby('ein')['aaemp_19'].transform('sum')

#### if EIN has less than $500 per employee, then treat it as missing
# (np.nan replaces the np.NaN alias, which NumPy 2.0 removed)
df['amt_peremp_ein'] = df['LoanAmount_ein'] / df['ein_aaemp_19']
cond = (df['amt_peremp_ein'] < 500).to_pandas()
df['LoanAmount_ein'] = np.where(cond, np.nan, df['LoanAmount_ein'].to_pandas())
df['DateApproved_ein'] = np.where(cond, np.nan, df['DateApproved_ein'].to_pandas())

### get amount and date approved: the EIN-level value is preferred, the
### establishment-level value fills in; a missing amount becomes 0
df['approval_date'] = pd.to_datetime(df['DateApproved_ein'].to_pandas().fillna(df['DateApproved'].to_pandas()), errors='coerce')
df['amount_final'] = df['LoanAmount_ein'].to_pandas().fillna(df['LoanAmount'].to_pandas()).fillna(0)

cond = (df['amount_final'] > 0).to_pandas()
df['Dppp_final'] = np.where(cond, 1, 0)


'''
*******************************
here's the plan:
i want to REMOVE eligible establishments that did NOT get a loan. This was a choice and therefore do not make a good control group. 

So the control group then becomes - establishments NOT eligible for PPP, and the not-yet-treated group

Then do a 5% sample
*******************************
'''
# The filter described in the plan above is currently disabled (commented out):
#df = df.loc[((df['Dppp_ein'] == 1) | (df['Deligible'] == 0))]
# Sanity check: print total 2019 employment in millions, next to the totals
# recorded on earlier runs.
print('***************as of 3/15 - 150 million, includes government')
print('***************as of 8/27/22 - 152 million, includes government')
print('total 2019 employment')
print(df['aaemp_19'].sum()/1e6)

############# random sample of establishments
# NOTE(review): the notes above mention a 5% sample, but samplecut below is .20 - confirm.
import random
# fixed seed so the draw is repeatable for a given ordering of ldblist
random.seed(69)
##### & (df['emp_max_19'] >= 2)
#####keeping observations that employed someone in 2019 (therefore max monthly employment is 2 or more
####& ~(df['naics'].isin([814110, 624120]))
####consider dropping Private households and healthcare assistance
#######very few match to PPP so may not even be eligible???
# Ownership code: take 'own', then fall back to 'own_19', then to 'owner_code'.
df['own'] = df['own'].astype('float').fillna(df['own_19'].astype('float')).fillna(df['owner_code'].astype('float'))
# Sampling frame: own == 5 (presumably the private-sector code - confirm),
# non-missing 2019 employment, and fipscnty in (0, 99999].
kcond = ((df['own'] == 5) & (df['aaemp_19'].notnull())  & (df['fipscnty'] > 0) & (df['fipscnty'] <= 99999)).to_array()
# unique ids in order of first appearance; the draw therefore depends on the row order of df
ldblist = df[kcond]['ldb_num'].to_pandas().unique().tolist()
###keep this share of the sampling frame (later sub-sample flags divide by samplecut)
samplecut = .20
samplist= random.sample(ldblist, int(len(ldblist) * samplecut))
# restrict df to the sampled establishments
df = df.loc[(df['ldb_num'].isin(samplist))]


### month and day of PPP approval
# Establishment-level date is parsed once; unparseable values become NaT.
approved = pd.to_datetime(df['DateApproved'].to_pandas(), errors='coerce')
df['ppp_month'] = approved.dt.month
df['ppp_day'] = approved.dt.day
# EIN-preferred date was already parsed into 'approval_date'.
df['ppp_ein_month'] = df['approval_date'].dt.month
df['ppp_ein_day'] = df['approval_date'].dt.day

###### QCEW ref period is 12th of month, so make PPP relative to that:
# approvals after the 12th are pushed into the following month.
for month_col, day_col in (('ppp_ein_month', 'ppp_ein_day'), ('ppp_month', 'ppp_day')):
    cond = (df[day_col] <= 12).to_array()
    month_vals = df[month_col].to_pandas()
    df[month_col] = np.where(cond, month_vals, month_vals + 1)

# establishments without an approval date get month 0
for month_col in ('ppp_month', 'ppp_ein_month'):
    df[month_col] = df[month_col].to_pandas().fillna(0)

# editing urban class: values of 5 or more collapse to 5, missing becomes 7
cond = (df['urban_classification'] >= 5).to_array()
urban_vals = df['urban_classification'].to_pandas()
df['urban_classification'] = np.where(cond, 5, urban_vals)
df['urban_classification'] = df['urban_classification'].to_pandas().fillna(7)



####bank information
# dfb = opennew('pppfiles/mergedbank', ['min_dist_bank_10', 'ldb_num']).drop_duplicates('ldb_num')
# df = df.merge(dfb[['ldb_num', 'min_dist_bank_10']], on = ['ldb_num'], how = 'left')
# dfb = dfb.merge(ldbdata(2000 + yr, ['ui_acct', 'fipsstate'], ''), on=['ldb_num'], how = 'inner')
#
# dfb = dfb.groupby(['fipsstate', 'ui_acct'], as_index=False)[['min_dist_bank_10']].mean().rename(columns = {'min_dist_bank_10' : 'min_dist_bank_10_uiacct'}).reset_index()
# df = df.merge(dfb, on = ['fipsstate', 'ui_acct'], how = 'left')
# dfb = None
#
# df['min_dist_bank_10'] = df['min_dist_bank_10'].to_pandas().fillna(df['min_dist_bank_10_uiacct'].to_pandas()).fillna(100)
# bins = [-1,.5,1,5,9e9]
# distbins = ['0', '1', '2', '3',]
# df['bank_dist_bins'] = pd.cut(df['min_dist_bank_10'].to_pandas(), bins=bins, labels = distbins).astype('str')
# df['bank_dist_bins'] = df['bank_dist_bins'].fillna('4')
# cond = (df['bank_dist_bins'] == 'nan').to_array()
# df['bank_dist_bins'] = np.where(cond, ['4']*len(cond), df['bank_dist_bins'].to_pandas())

# missing yearly growth is treated as zero growth
df['yr_growth'] = df['yr_growth'].to_pandas().fillna(0)


### EIN-level size class: right-closed bins, labelled by the lower edge
tlabs = ['0', '1', '10', '50', '500', '5000']
bins = [-1, 1, 10, 50, 500, 5000, 9e9]
ein_emp = df['ein_aaemp_19'].to_pandas()
df['ein_size_class'] = pd.cut(ein_emp, bins=bins, labels=tlabs, right=True).astype('float')


### establishment-level size class: left-closed bins, labelled by the lower edge
tlabs = ['0', '10', '50', '100']
bins = [-1, 10, 50, 100, 9e9]
estab_emp = df['aaemp_19'].to_pandas()
df['size_class'] = pd.cut(estab_emp, bins=bins, labels=tlabs, right=False).astype('float')

# size_class_multi prefixes the size class with 'm' when the EIN employs more
# than this establishment and 's' otherwise; rows that match neither test
# keep the empty string.
df['size_class_multi'] = ''
ein_is_larger = df['ein_aaemp_19'] > df['aaemp_19']
ein_not_larger = df['ein_aaemp_19'] <= df['aaemp_19']
for t in tlabs:
    t = float(t)
    suffix = str(int(t))
    in_class = df['size_class'] == t
    cond = (in_class & ein_is_larger).to_array()
    cond1 = (in_class & ein_not_larger).to_array()
    current = df['size_class_multi'].to_pandas()
    df['size_class_multi'] = np.where(cond, 'm' + suffix, np.where(cond1, 's' + suffix, current))

####filling in missing HHI
# df['hhi_stnaics2mean'] = df[['hhi','fipsstate', 'naics2']].to_pandas().groupby(['fipsstate', 'naics2'])['hhi'].transform("mean")
# df['hhi'] = df['hhi'].to_pandas().fillna(df['hhi_stnaics2mean'].to_pandas())
# tlabs = ['0', '1000',  '2500',]
# bins = [-1, 1000, 2500, 9e9]
# df['hhi_cut'] = pd.cut(df['hhi'].to_pandas(), bins=bins, labels=tlabs, right=False).astype('float')
#
# tlabs = ['0', '1000',]
# bins = [-1, 1000,  9e9]
# df['hhi_cut2'] = pd.cut(df['hhi'].to_pandas(), bins=bins, labels=tlabs, right=False).astype('float')

### wages
# average wage per worker, cut into five bands numbered 1-5
# (band edges at 20k, 40k, 60k and 80k; the top band is open-ended)
bins = [-1, 20e3, 40e3, 60e3, 80e3, float('inf')]
wagelist = list(range(1, 6))
wage_per_worker = df['wage_per_worker'].to_pandas()
df['avg_wages_bin'] = pd.cut(wage_per_worker, bins=bins, labels=wagelist).astype('float')

#### age cut
# age is measured relative to 2020 from the setup year
df['age'] = 2020 - df['setup_yr']

# right-closed bands: <=5, 6-10, 11-20, 21+; labels are the lowest age in each band
bins = [-9e9, 5, 10, 20, 9e9]
agelist = ['0', '6', '11', '21']
establishment_age = df['age'].to_pandas()
df['age_bins'] = pd.cut(establishment_age, bins=bins, labels=agelist, right=True).astype('float')

####poverty
# dfcest = cudf.read_csv(dataloc + 'pppfiles/countypoverty.csv')
#
#
# mc = ['fipsstate', 'fipscounty']
# df = df.merge(cudf.DataFrame(dfcest), on = mc, how = 'left')
# tlabs = ['0', '10',  '15',]
# bins = [-1, .10, .15,  9e9]
# df['poverty_cut'] = pd.cut(df['pct_poverty'].to_pandas(), bins=bins, labels=tlabs, right=False).astype('float')
#
# df['poverty_cut']= df['poverty_cut'].to_pandas().fillna(999)


################################################################################################################################
################################################################################################################################
################################################################################################################################
################################################################################################################################
### this is the part that needs adjusting
#####8/28- not sure what needs adjusting
################################################################################################################################
################################################################################################################################
################################################################################################################################
################################################################################################################################
# Load the monthly employment file saved under this script's name. The code
# below uses its 'ldb_num', 'num_month' and 'emp_m' columns.
df1 = pd.read_csv(dataloc + 'pppfiles/' + filename + '_emp.csv')
#only keep values needed for PPP sample
df1 = df1[df1['ldb_num'].isin(samplist)]
#closure info - merge in later
# last num_month with a non-missing emp_m, per establishment
dfc = df1[df1['emp_m'].notnull()][['ldb_num', 'num_month']].groupby('ldb_num', as_index=False)['num_month'].max()
#idea being, goes into previous quarter
# Dclose_perm = 1 when that last reported month falls before moncount - 5
cond = (dfc['num_month'] < moncount - 5)
dfc['Dclose_perm'] = np.where(cond,1 , 0)
dfc = dfc[['ldb_num', 'Dclose_perm']]
##merge
df1 = df1.merge(dfc, on = ['ldb_num'], how = 'left')
# for establishments flagged as closed, missing employment is recoded to 0
cond = ((df1['Dclose_perm'] == 1) & (df1['emp_m'].isnull()))
df1['emp_m'] = np.where(cond, 0, df1['emp_m'])
###secondary closure measure to get a mid-year missings
# Sum employment over months >= moncount - 5; a total of 0 sets Dclose_perm2.
# pandas sums skip missing values by default, so a window with no reported
# employment at all also totals 0 and is flagged.
dfc = df1[(df1['num_month'] >= moncount - 5)].groupby('ldb_num', as_index=False)['emp_m'].sum()
cond = (dfc['emp_m'] == 0)
dfc['Dclose_perm2'] = np.where(cond,1 , 0)
dfc = dfc[['ldb_num', 'Dclose_perm2']]
##merge
df1 = df1.merge(dfc, on = ['ldb_num'], how = 'left')
# same recode as above, for the secondary closure flag
cond = ((df1['Dclose_perm2'] == 1) & (df1['emp_m'].isnull()))
df1['emp_m'] = np.where(cond, 0, df1['emp_m'])
dfc = None

##total observations
# number of months with a non-missing emp_m per establishment (after the 0 fills above)
df1['tot_obs'] = df1[['ldb_num', 'emp_m']].groupby('ldb_num')['emp_m'].transform('count')



####this still includes closures, right? or does it not?
# NOTE(review): flagged closures had their missing months recoded to 0 above, so
# those months count as observed here; such establishments appear to be kept as
# long as they reach totobs months - confirm this is intended.
totobs = q*3
# balanced panel: keep establishments observed in exactly q*3 months
kcond = (df1['tot_obs'] == totobs)
df1 = df1[kcond][['ldb_num', 'emp_m', 'num_month']]


#####merge in the prev year info. The idea is to take average of SAME month employment / wages
###use this as baseline
# np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0.
df1['emp_avg'] = np.nan
df1['wage_avg'] = np.nan

# The 2019 file is read for its per-calendar-month columns,
# 'emp_m<k>_avg' and 'wage_m<k>_avg' for k = 1..12 (see the loop below).
df1 = df1.merge(pd.read_csv(dataloc + 'ldb/'+'CRldb_ppp_yearempv2' + str(2019) + '.csv'), on = ['ldb_num'], how = 'left', indicator=False)

# For each panel month j, pick the baseline of the matching calendar month:
# j1 wraps 13 -> 1, 14 -> 2, ... so later years reuse the same 12 baselines.
for j in range(1,moncount):
    cond = (df1['num_month'] == j)
    j1 = ((j-1) % 12) + 1
    df1['emp_avg'] = np.where(cond, df1['emp_m'+str(j1)+'_avg'] , df1['emp_avg'])
    df1['wage_avg'] = np.where(cond, df1['wage_m' + str(j1) + '_avg'], df1['wage_avg'])

# drop the wide per-month baseline columns now that they have been collapsed
df1 = df1[['ldb_num', 'emp_m', 'num_month', 'emp_avg', 'wage_avg']]


# every column whose name ends in '_closed' is carried through to the panel
closedmons = [col for col in df if col.endswith('_closed')]

# establishment attributes to keep when moving to the monthly panel
kcols = [
    # identifiers and location
    'ldb_num', 'fips', 'ui_acct', 'run_num', 'ein', 'cnty', 'naics',
    'zip5',
    # 2019 employment
    'aaemp_19', 'ein_aaemp_19', 'emp_max_19',
    #'num_estab_ein',
    # setup / closure timing
    'setup_yr', 'setup_month', 'closed_yr', 'closed_month',
    'new_month', 'size_class', 'size_class_multi', 'naics2', 'LoanAmount',
    'LoanAmount_ein', 'D_eidl_grant', 'D_eidl_loan',
    'Dfranchise', 'cutoff_score', 'yr_growth',
    'fipscnty', 'urban_classification', 'Deligible', 'Dppp', 'Dppp2021', 'Dppp_ein',
    'amount_final', 'ppp_ein_month',
    'ein_size_class', 'avg_wages_bin', 'age_bins', 'DateApproved_2021', 'eidlloan_date', 'eidlgrant_date',
    'emp_20tot', 'emp_jan_feb20',
] + closedmons


# From here on df is a pandas frame. The inner merge keeps only establishments
# present in the employment panel (df1).
df = df[kcols].to_pandas()
df = df.merge(df1, on=['ldb_num'], how='inner')

# release the employment panel before loading the wage file
df1 = None
df1 = pd.read_csv(dataloc + 'pppfiles/' + filename + '_wage.csv')
df1 = df1[df1['ldb_num'].isin(samplist)]

# wages attach per establishment-month; months without a wage record stay missing
df = pd.DataFrame(df).merge(df1, on=['ldb_num', 'num_month'], how='left')
df1 = None

'''
this gets measure of PPP 2021 month of receipt
'''
## maps the 2021 approval month (1-6) to a panel month number
## month since Jan 2020, minus 1 for anticipation effects
# NOTE(review): months 1 and 2 map to 12 and 13 (index minus 1), while months
# 3-6 map to 15-18 with no adjustment, skipping 14 - confirm this is intended.
tdict = {1: 12, 2: 13, 3: 15, 4: 16, 5: 17, 6: 18}
# the first two characters of DateApproved_2021 are read as the month
approval_month_2021 = df['DateApproved_2021'].str[:2].astype('float')
df['ppp21_num_month'] = approval_month_2021.map(tdict)
# 1 from the mapped month onward; rows without a 2021 approval compare False and get 0
cond = (df['num_month'] >= df['ppp21_num_month'])
df['Dppp21_month'] = np.where(cond, 1, 0)

### same for EIDL
# all are in 2020, so the panel month is the calendar month minus 1
tdict = {i: i - 1 for i in range(1, 13)}
for kind in ('loan', 'grant'):
    month_col = 'eidl' + kind + '20_num_month'
    df[month_col] = df['eidl' + kind + '_date'].dt.month.map(tdict)
    cond = (df['num_month'] >= df[month_col])
    df['Deidl' + kind + '20_month'] = np.where(cond, 1, 0)

'''
the closure info is wonky, need something better
this pulls from LDB
'''
# cols = ['ldb_num', 'end_liab','last_emp','yrqtr', 'del']
# mc = ['ldb_num']
# f = 'ldb34017_closedq321.dat'
# dft = cudft.read_csv(dataloc + 'ldb/' + f,sep='\t', header=None)
# dft.columns = cols
# dft = dft.sort_values(['ldb_num', 'yrqtr']).drop_duplicates(subset='ldb_num', keep='last')
# dft['liab_year'] = pd.to_numeric(dft['end_liab_date'].to_pandas().apply(lambda x: str(x)[:4]), errors='coerce')
# dft['last_year'] = pd.to_numeric(dft['last_emp'].to_pandas().apply(lambda x: str(x)[:4]), errors='coerce')
# cond = ((dft['liab_year'] > 2019) | (dft['last_year'] > 2019)).to_pandas()
# dft['Dclosed_step1'] = np.where(cond, 1, 0)
# kcols = ['Dclosed_step1']
# df = df.merge(dft[kcols + mc], on = mc, how = 'left')
# dft = None
#
#
# cond = ((df['closed_yr'] > 2019) & ((df['emp_m'] == 0) | (df['emp_m'].isnull()))).to_pandas()
# df['Dclosed'] = np.where(cond, 1, 0)
'''
did close from step above, but need to bring it over to wage info, too
'''
# months with zero employment also get zero wages
cond = (df['emp_m'] == 0)
df['wage_m'] = np.where(cond, 0, df['wage_m'])

# Dclosed: 1 when employment is zero, missing when employment is missing, else 0.
# `cond` from above is reused; emp_m has not changed since it was computed.
conda = (df['emp_m'].isnull())
# np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0.
df['Dclosed'] = np.where(cond, 1, np.where(conda, np.nan, 0))


# employment and wages as a percentage of the same-month 2019 baseline
df['pct_emp'] = 100 * df['emp_m'] / df['emp_avg']
df['pct_wage'] = 100 * df['wage_m'] / df['wage_avg']
for c in ['emp', 'wage']:
    # a zero baseline would give inf or NaN; those rows are set to 100 instead,
    # and any remaining +inf becomes missing
    cond = (df[c+'_avg'] == 0)
    df['pct_'+c] = np.where(cond, 100, df['pct_'+c].replace(np.inf,np.nan))

##does this edit in final results
# print("number lost due to max value restrictions")
# maxvalue = 10000
# kcond = (df['pct_emp'] < maxvalue) & (df['pct_wage'] < maxvalue)
# print(len(df))
# df = df.loc[kcond]
# print(len(df))


#missing fips for some reason
#df['fipscnty'] = df['fips'] * 1000 + df['fipscounty']

###using avg of prev year
# np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0.
df['pct_emp_1']  = (100 * df['emp_m'] / df['aaemp_19']).replace(np.inf,np.nan)
###using emp of jan 2020
#need to merge in jan emp
mc = ['ldb_num']
# one row per establishment holding its num_month == 1 employment
dft = df[df['num_month'] == 1][['ldb_num', 'emp_m']].rename(columns = {'emp_m' : 'emp_jan20'}).drop_duplicates(mc)
df = df.merge(dft, on = mc, how='left')
# a zero baseline with positive employment gives +inf, which is treated as missing
df['pct_emp_2']  = (100 * df['emp_m'] / df['emp_jan20']).replace(np.inf,np.nan)
dft = None

#creates dummy for if they meet PPP employment threshhold
# df['temp_jf20'] = (df['emp_m'] / df['emp_jan_feb20']).replace(np.inf,np.NaN)
# #df['temp_fj19'] = (df['emp_m'] / df['emp_feb_june19']).replace(np.inf,np.NaN)
# #cond = ((df['temp_jf20'] >= 1) | (df['temp_fj19'] >= 1))
# cond = ((df['temp_jf20'] >= 1) )
# df['Dppp_employment_verify'] = np.where(cond, 1, 0)
# #version looking at 95%
# cond = (((df['temp_jf20'] >= .80) & (df['temp_jf20'] < 1)))
# df['Dppp_employment_verify_80_99'] = np.where(cond, 1, 0)


#dfces.to_pandas().to_csv(dataloc + 'pppfiles/' + filename + '_unbalanced.csv')
# Candidate output columns. The list contains repeats, so it is de-duplicated.
# dict.fromkeys keeps the listed order, so the saved column order is the same
# on every run; list(set(...)) ordered the strings by their per-process
# randomised hash, which reshuffled the output columns between runs.
kcols = list(dict.fromkeys([
    'naics2', 'fips', 'age_bins', 'avg_wages_bin', 'urban_classification',
    'ein_size_class', 'size_class', 'size_class_multi', 'Dclosed', 'pct_emp', 'pct_wage',
    'yr_growth', 'ppp_ein_month', 'ldb_num', 'num_month', 'age_bins', 'bank_dist_bins',
    'avg_wages_bin', 'Deligible', 'Dfranchise', 'fipscnty', 'aaemp_19', 'cutoff_score', 'amount_final', 'LoanAmount',
    'LoanAmount_ein', 'ein', 'ein_aaemp_19', 'emp_max_19',
    #'janmar_emp_ratio', 'aprjun_emp_ratio', 'julsep_emp_ratio', 'octdec_emp_ratio',
    #'janmar_wages_ratio', 'aprjun_wages_ratio', 'julsep_wages_ratio', 'octdec_wages_ratio',
    'D_eidl_grant', 'D_eidl_loan', 'emp_m', 'wage_m', 'Dppp2021',
    'pct_emp_1', 'pct_emp_2', 'emp_avg', 'wage_avg',
    'Dppp_employment_verify', 'poverty_cut', kwvar, 'hhi_cut', 'hhi_cut2',
    'Dppp_employment_verify_80_99', 'fipscnty', 'naics', 'Deidlloan20_month', 'Deidlgrant20_month', 'Dppp21_month',
    'emp_20tot', 'emp_jan_feb20',
])) + closedmons

# several names above belong to steps that are currently commented out;
# keep only the columns that actually exist
kcols = [c for c in kcols if c in df]

### need to fill in 0 for these
for c in [ 'D_eidl_grant', 'D_eidl_loan','Dppp2021']:
    df[c] = df[c].fillna(0)

print("checking to see how many observations are lost by dropping nulls")
print("number of missings for each column")
print(df[kcols].isnull().sum())
print(len(df))
# columns allowed to stay missing; a null in any other kept column drops the row
othercols = ['LoanAmount_ein', 'LoanAmount', 'pct_emp_1', 'pct_emp_2', 'emp_avg', 'amount_final', 'fipscnty']
df = df[kcols].dropna(subset = [i for i in kcols if i not in othercols]).drop_duplicates(['ldb_num', 'num_month'])
print(len(df))

# state-level UI replacement rates, binned into low / middle / high
ui_path = dataloc + 'rona/ui_replacement_rates_ganongetal.txt'
dfui = pd.read_csv(ui_path, sep='\t')
# the last row of the file is dropped and the state column renamed
dfui = dfui.iloc[:-1].rename(columns={'State': 'state'})
dfui['state'] = dfui['state'].apply(lambda x: x.strip())
# every column after the first holds text such as percentages; remove the
# '%', '(' and ')' characters and convert to numbers
strip_marks = str.maketrans('', '', '%()')
for c in list(dfui.columns)[1:]:
    dfui[c] = pd.to_numeric(dfui[c].apply(lambda x: float(x.translate(strip_marks))))

# look up each state's FIPS code with the project helper
dfui['fipsstate'] = dfui['state'].apply(lambda x: stateinfo(x.strip(), 'fips'))
bins = [-1, 141, 154, float('inf')]
wagelist = ['<141%', '141-154%', '154%+']
rate_with_fpuc = dfui['replacement rate with FPUC']
dfui['replacement_bins'] = pd.cut(rate_with_fpuc, bins=bins, labels=wagelist)

# attach the bin to each establishment through its state FIPS code
tdict = dict(zip(dfui['fipsstate'], dfui['replacement_bins']))
df['replacement_bins'] = df['fips'].map(tdict)

# number of rows with a non-missing fips per establishment
df['num_obs'] = df[['ldb_num', 'fips']].groupby('ldb_num')['fips'].transform('count')

# print("checking to see how many observations are lost with the full 15 month cutoff")
# print(len(df))
totobs = q*3
# kcond = (df['num_obs'] == totobs)
# df = df.loc[kcond]
# print(len(df))

'''
markers and edits for robustness checks
'''
### D0month: count the months whose baseline employment is 0 per establishment,
### then set the flag to 1 when there are none (0 when at least one exists);
### to be used for alternative specification
cond = (df['emp_avg'] == 0)
df['D0month'] = np.where(cond, 1, 0)
df['D0month'] = df.groupby('ldb_num')['D0month'].transform("sum")
cond = (df['D0month'] == 0)
df['D0month'] = np.where(cond, 1, 0)

### Dpct_emp_1 / Dpct_emp_2: 1 when the matching pct_emp measure is non-missing
### in all totobs months of the establishment, else 0
for measure in ('pct_emp_1', 'pct_emp_2'):
    flag = 'D' + measure
    cond = (df[measure].notnull())
    df[flag] = np.where(cond, 1, 0)
    df[flag] = df.groupby('ldb_num')[flag].transform("sum")
    cond = (df[flag] == totobs)
    df[flag] = np.where(cond, 1, 0)


###save
###dfces.to_csv(dataloc + 'pppfiles/' + filename + '_cudfversion.csv')
#### get sample cutoffs
# Flags for smaller random sub-samples. Each share is divided by samplecut
# because df already holds only that fraction of the sampling frame. The three
# draws are independent of one another and made in this order, which matters
# for the state of the random generator.
ldblist = df['ldb_num'].unique().tolist()
for share, flag in ((.01, 'D1pctsample'), (.05, 'D5pctsample'), (.10, 'D10pctsample')):
    samplist = random.sample(ldblist, int(len(ldblist) * share / (samplecut)))
    cond = (df['ldb_num'].isin(samplist))
    df[flag] = np.where(cond, 1, 0)

### key dep vars
# constant flag columns named 'D' + <name>, i.e. 'Dpct_wage', 'Dpct_emp',
# 'DDclosed' and 'DDppp_employment_verify'
for c in ['pct_wage', 'pct_emp', 'Dclosed', 'Dppp_employment_verify']:
    df['D' + c] = 1

# Dpct_wage2 marks establishments that never have Dclosed == 1;
# pct_wage2 is a plain copy of pct_wage
df['Dclosedsum'] = df.groupby('ldb_num')['Dclosed'].transform('sum')
cond = (df['Dclosedsum'] == 0)
df['Dpct_wage2'] = np.where(cond, 1, 0)
df['pct_wage2'] = df['pct_wage'].copy()

'''
****************************************
****************************************
this ties in cnty-naics4 employment and wage
****************************************
****************************************
'''
# 4-digit industry code: the first four characters of naics
df['naics4'] = df['naics'].astype('str').str[:4].astype('float')
#df['zip5'] = pd.to_numeric(df['zip5'], errors='coerce')
mc = ['fipscnty', 'naics4']
cell_dir = dataloc + 'pppfiles/'
dft = []
for i in range(1, moncount):
    # county x naics4 employment totals for panel month i
    renamedict = {'emp_m': 'zn_emp',
                  'emp_19': 'zn_emp19'}
    cell_emp = pd.read_csv(cell_dir + 'CRppp_cntynaicsv1_' + str(i) + '.csv')
    cell_emp = cell_emp.drop_duplicates(subset=['fipscnty', 'naics4']).rename(columns=renamedict)
    dft1 = df[df['num_month'] == i].merge(cell_emp, on=mc, how='inner')

    # county x naics4 wage totals for the same month
    renamedict = {'wage_m': 'zn_wage',
                  'wages_19': 'zn_wage19'}
    cell_wage = pd.read_csv(cell_dir + 'CRppp_cntynaicsv1_wage' + str(i) + '.csv')
    cell_wage = cell_wage.drop_duplicates(subset=['fipscnty', 'naics4']).rename(columns=renamedict)
    dft1 = dft1.merge(cell_wage, on=mc, how='inner')

    # Leave-one-out ratios: the cell total minus the establishment's own value,
    # current month over 2019 baseline. Infinite ratios are set to 1.
    rest_emp_now = dft1['zn_emp'] - dft1['emp_m']
    rest_emp_base = dft1['zn_emp19'] - dft1['emp_avg']
    dft1['cntynaics_emp'] = (rest_emp_now / rest_emp_base).replace([np.inf, -np.inf], 1)
    #divide 2019 wage by 3 because it's quarterly
    rest_wage_now = dft1['zn_wage'] - dft1['wage_m']
    rest_wage_base = (dft1['zn_wage19'] / 3) - dft1['wage_avg']
    dft1['cntynaics_wage'] = (rest_wage_now / rest_wage_base).replace([np.inf, -np.inf], 1)
    # negative ratios are also set to 1
    for c in ['cntynaics_emp', 'cntynaics_wage']:
        cond = (dft1[c] < 0)
        dft1[c] = np.where(cond, 1, dft1[c])
    dft.append(dft1)
    dft1 = None

### rows with a missing county cannot match a cell above; flag them and add
### them back so they are not lost in the inner merges
cond = (df['fipscnty'].isnull())
df['Dmissingcnty'] = np.where(cond, 1, 0)
dft.append(df[df['Dmissingcnty'] == 1])


# stack all months, drop the cell totals and the establishment-level
# emp_m / wage_m, and keep one row per establishment-month
dcols = ['zn_emp', 'zn_emp19', 'zn_wage', 'zn_wage19', 'wage_m', 'emp_m']
df = pd.concat(dft).drop(columns=dcols).drop_duplicates(['ldb_num', 'num_month'])
dft = None
### recompute the missing-county flag on the stacked frame and give those rows
### a neutral ratio of 1
cond = (df['fipscnty'].isnull())
df['Dmissingcnty'] = np.where(cond, 1, 0)
for c in ['cntynaics_emp', 'cntynaics_wage']:
    df[c] = np.where(cond, 1, df[c])

# drop rows where either ratio is still missing
kcond = ((df['cntynaics_emp'].isnull()) | (df['cntynaics_wage'].isnull()))
df = df[~kcond]

'''
***********************************************************
************************************************************************
adding in telework measure
***********************************************************
************************************************************************
'''

##read in results
# Locate the telework results file via the project helper `mostrecentfile`
# (presumably returns the path of the newest matching file - confirm).
ttfname = 'ANqbs2_v2_q3teleworkv3v3'
tfname = mostrecentfile(ttfname+'.csv', '/dataERS/eract/daltonm/results/qbs/')
dft= pd.read_csv(tfname)


# treat vanishingly small estimates (below 1e-7) as exactly zero
for c in ['ft_post', 'ft_pre', 'some_post', 'some_pre', 'ft_during', 'some_during']:
    cond = (dft[c] < .0000001)
    dft[c] = np.where(cond, 0, dft[c])



# occupation code: the 'occupation' string without its first 5 and last 5 characters
dft['occ5'] = dft['occupation'].str[5:-5]
###gives telework prediction from Matt-Mark_etc.

###get estimates
###some is assumed to be half-time
# take-up = full-time share + half of the part-time share, capped at 1
# NOTE(review): min(1, x) returns 1 when x is NaN, so a missing estimate would
# become full take-up - confirm the inputs have no missing values.
dft['prepandemic_takeup'] = (dft['ft_pre'] + .5 * dft['some_pre']).apply(lambda x: min(1,x))
dft['postpandemic_takeup'] = (dft['ft_post'] + .5 * dft['some_post']).apply(lambda x: min(1,x))
dft['duringpandemic_takeup'] = (dft['ft_during'] + .5 * dft['some_during']).apply(lambda x: min(1,x))
###get SOC name
# '###' is the field separator; only rows whose code ends in '0' are kept.
# The first 6 characters of the code (with '-' replaced by '_') key the lookup.
socs = pd.read_csv(dataloc + 'oesreference/soc_text.txt', sep = '###', header=None)
socs = socs[socs[0].str[-1:] == '0']
tdict2 = dict(zip(socs[0].str[:6].str.replace('-', '_'), socs[1]))
# occ_name and change_takeup are not among the columns selected for the merge below
dft['occ_name'] = dft['occ5'].map(tdict2)
dft['change_takeup'] = dft['postpandemic_takeup'] - dft['prepandemic_takeup']
###2018 employment
# NOTE(review): the file name says 2017 while the comment above and the column
# name below say 2018 - confirm the reference year.
# employment summed by establishment and occupation code (read with cudf)
dft1 = cudf.read_csv(dataloc + "qbs/ldboes20172.csv").groupby(['LDB_NUM','OCC_CODE_2017'], as_index=False)['emp'].sum().reset_index().rename(columns = {'LDB_NUM' : 'ldb_num'})
# drop the last character of the code and aggregate again at that level
dft1['occ5'] = dft1['OCC_CODE_2017'].str[:-1]
dft1 = dft1.groupby([ 'ldb_num', 'occ5'], as_index=False)['emp'].sum().reset_index()
dft1['occ5'] = dft1['occ5'].to_pandas().str.replace('-', '_')
#merge
# right merge: every establishment-occupation row is kept, with missing take-up
# where the occupation has no telework estimate
dft = dft[['occ5','prepandemic_takeup', 'postpandemic_takeup',
       'duringpandemic_takeup']].merge(dft1.to_pandas(), on = ['occ5'], how = 'right').rename(columns = {'emp' : 'emp_2018'})

###get % teleworking
# only the during-pandemic take-up is used from here on
dft['emp_tele'] = dft['emp_2018'] * dft['duringpandemic_takeup']
# pandas sums skip missing values by default, so occupations without an
# estimate add to emp_2018 but not to emp_tele
dft = dft.groupby('ldb_num', as_index=False)[['emp_2018', 'emp_tele']].sum()
dft['pct_tele'] = dft['emp_tele'] / dft['emp_2018']
bins = [-9e9,.01, .05, .1, .25,.5,9e9]
labs = ['0-1pct', '1-5pct', '5-10pct', '10-25pct', '25-50pct', '50+pct']
dft['tele_bins'] = pd.cut(dft['pct_tele'], bins=bins, labels = labs)

# attach the telework band; establishments without occupation data get a missing band
mc = ['ldb_num']
df = df.merge(dft[['ldb_num', 'tele_bins']].drop_duplicates(mc), on = mc, how = 'left')
dft = None

'''
************************************************************************
************************************************************************
save it
************************************
************************************
'''
# use this list for the 2019 data: one row per establishment id in the final frame
df[['ldb_num']].drop_duplicates('ldb_num').to_csv(dataloc + 'pppfiles/' + filename + '_ldbnums.csv')
df.to_csv(dataloc + 'pppfiles/' + filename + '.csv')
print('full data saved')
#ldblist = cudf.read_csv(dataloc + 'pppfiles/' + filename + '_ldbnums.csv')
# Small extract of up to 30,000 establishments. random.sample raises ValueError
# when asked for more items than the list holds, so the size is capped at the
# number of ids available.
# NOTE(review): ldblist was built before the county-industry merge dropped rows,
# so some sampled ids may no longer be in df - confirm whether it should be rebuilt.
samplist = random.sample(ldblist, min(int(3e4), len(ldblist)))
cond = (df['ldb_num'].isin(samplist))
df['Dsmallsample'] = np.where(cond, 1, 0)

df.loc[(df['Dsmallsample']==1)].to_csv(dataloc + 'pppfiles/' + filename + '_small.csv')

